import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
uber_apr14= pd.read_csv(r'C:\Datasets\Uber Analysis\uber-raw-data-apr14.csv',encoding='utf-8')
files=os.listdir(r'C:\Datasets\Uber Analysis')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
files.remove('uber-raw-data-janjune-15.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
path=r'C:\Datasets\Uber Analysis'
#blank dataframe
final=pd.DataFrame()
for file in files:
df=pd.read_csv(path+"/"+file,encoding='utf-8')
final=pd.concat([df,final])
final.shape
(4534327, 4)
df=final.copy()
df.head()
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
df.shape
(4534327, 4)
df.dtypes
Date/Time object Lat float64 Lon float64 Base object dtype: object
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")
df.dtypes
Date/Time datetime64[ns] Lat float64 Lon float64 Base object dtype: object
df['weekday']=df['Date/Time'].dt.day_name()
df['day']=df['Date/Time'].dt.day
df['minute']=df['Date/Time'].dt.minute
df['month']=df['Date/Time'].dt.month
df['hour']=df['Date/Time'].dt.hour
df.dtypes
Date/Time datetime64[ns] Lat float64 Lon float64 Base object weekday object day int64 minute int64 month int64 hour int64 dtype: object
df.head()
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 | Monday | 1 | 3 | 9 | 0 |
| 3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 | Monday | 1 | 6 | 9 | 0 |
| 4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 | Monday | 1 | 11 | 9 | 0 |
df['Base'].unique()
array(['B02512', 'B02598', 'B02617', 'B02682', 'B02764'], dtype=object)
df['day'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
dtype=int64)
df['weekday'].unique()
array(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
'Sunday'], dtype=object)
import plotly.express as px
px.bar(x=df['weekday'].value_counts().index,
y= df['weekday'].value_counts().values,
labels={'x':'Weekday','y':'Number of Journeys'},
title='Journeys Per Weekday')
plt.hist(df['hour'])
plt.show()
for i,month in enumerate(df['month'].unique()):
print(month)
9 5 6 7 8 4
plt.figure(figsize=(40,20))
for i,month in enumerate(df['month'].unique()):
plt.subplot(3,2,i+1)
df[df['month']==month]['hour'].hist()
df.groupby('month')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016708C00EF0>
px.bar(x = df.groupby('month')['hour'].count().index,
y = df.groupby('month')['hour'].count(),
labels={'x':'Month','y':'Num of Rides'},
title='Rides Per Month')
plt.figure(figsize=(12,6))
plt.hist(df['day'], bins=30, rwidth=.8, range=(0.5, 30.5))
plt.xlabel('Day of The Month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')
plt.show()
plt.figure(figsize=(20,8))
for i,month in enumerate(df['month'].unique(),1):
plt.subplot(3,2,i)
df_out=df[df['month']==month]
plt.hist(df_out['day'])
plt.xlabel('days in month'.format(i))
plt.ylabel('total rides')
plt.figure(figsize=(10,6))
sns.set_style(style='whitegrid')
sns.pointplot(x="hour",y="Lat",data=df)
plt.show()
Rush hours are in the morning when people are going to work and afternoon when they coming back from work.
plt.figure(figsize=(10,6))
ax=sns.pointplot(x="hour",y="Lat", hue="weekday",data=df)
ax.set_title('Hours of Day vs Latiitide of Passenger')
plt.show()
The rush hours are mainly in the middle of the week between 7AM and 7PM when people are going to work and return home. That is the reason we can observe a decrease in latitude on weekend.
df.head()
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 | Monday | 1 | 3 | 9 | 0 |
| 3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 | Monday | 1 | 6 | 9 | 0 |
| 4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 | Monday | 1 | 11 | 9 | 0 |
df['Base'].head()
0 B02512 1 B02512 2 B02512 3 B02512 4 B02512 Name: Base, dtype: object
df.groupby(['Base','month'])['Date/Time'].count()
Base month
B02512 4 35536
5 36765
6 32509
7 35021
8 31472
9 34370
B02598 4 183263
5 260549
6 242975
7 245597
8 220129
9 240600
B02617 4 108001
5 122734
6 184460
7 310160
8 355803
9 377695
B02682 4 227808
5 222883
6 194926
7 196754
8 173280
9 197138
B02764 4 9908
5 9504
6 8974
7 8589
8 48591
9 178333
Name: Date/Time, dtype: int64
base=df.groupby(['Base','month'])['Date/Time'].count().reset_index().rename(columns={'Date/Time':'Num of Journeys'})
base
| Base | month | Num of Journeys | |
|---|---|---|---|
| 0 | B02512 | 4 | 35536 |
| 1 | B02512 | 5 | 36765 |
| 2 | B02512 | 6 | 32509 |
| 3 | B02512 | 7 | 35021 |
| 4 | B02512 | 8 | 31472 |
| 5 | B02512 | 9 | 34370 |
| 6 | B02598 | 4 | 183263 |
| 7 | B02598 | 5 | 260549 |
| 8 | B02598 | 6 | 242975 |
| 9 | B02598 | 7 | 245597 |
| 10 | B02598 | 8 | 220129 |
| 11 | B02598 | 9 | 240600 |
| 12 | B02617 | 4 | 108001 |
| 13 | B02617 | 5 | 122734 |
| 14 | B02617 | 6 | 184460 |
| 15 | B02617 | 7 | 310160 |
| 16 | B02617 | 8 | 355803 |
| 17 | B02617 | 9 | 377695 |
| 18 | B02682 | 4 | 227808 |
| 19 | B02682 | 5 | 222883 |
| 20 | B02682 | 6 | 194926 |
| 21 | B02682 | 7 | 196754 |
| 22 | B02682 | 8 | 173280 |
| 23 | B02682 | 9 | 197138 |
| 24 | B02764 | 4 | 9908 |
| 25 | B02764 | 5 | 9504 |
| 26 | B02764 | 6 | 8974 |
| 27 | B02764 | 7 | 8589 |
| 28 | B02764 | 8 | 48591 |
| 29 | B02764 | 9 | 178333 |
plt.figure(figsize=(10,6))
sns.lineplot(x='month',y='Num of Journeys',hue='Base',data=base)
plt.show()
Seems like B02617 and B02764 are showing a sharp increase on Summer months while rest of the baslines remain on the same number of journeys most of the time.
def count_rows(rows):
return len(rows)
by_cross = df.groupby(['weekday','hour']).apply(count_rows)
by_cross
weekday hour
Friday 0 13716
1 8163
2 5350
3 6930
4 8806
...
Wednesday 19 47017
20 47772
21 44553
22 32868
23 18146
Length: 168, dtype: int64
pivot=by_cross.unstack()
pivot
| hour | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| weekday | |||||||||||||||||||||
| Friday | 13716 | 8163 | 5350 | 6930 | 8806 | 13450 | 23412 | 32061 | 31509 | 25230 | ... | 36206 | 43673 | 48169 | 51961 | 54762 | 49595 | 43542 | 48323 | 49409 | 41260 |
| Monday | 6436 | 3737 | 2938 | 6232 | 9640 | 15032 | 23746 | 31159 | 29265 | 22197 | ... | 28157 | 32744 | 38770 | 42023 | 37000 | 34159 | 32849 | 28925 | 20158 | 11811 |
| Saturday | 27633 | 19189 | 12710 | 9542 | 6846 | 7084 | 8579 | 11014 | 14411 | 17669 | ... | 31418 | 38769 | 43512 | 42844 | 45883 | 41098 | 38714 | 43826 | 47951 | 43174 |
| Sunday | 32877 | 23015 | 15436 | 10597 | 6374 | 6169 | 6596 | 8728 | 12128 | 16401 | ... | 28151 | 31112 | 33038 | 31521 | 28291 | 25948 | 25076 | 23967 | 19566 | 12166 |
| Thursday | 9293 | 5290 | 3719 | 5637 | 8505 | 14169 | 27065 | 37038 | 35431 | 27812 | ... | 36699 | 44442 | 50560 | 56704 | 55825 | 51907 | 51990 | 51953 | 44194 | 27764 |
| Tuesday | 6237 | 3509 | 2571 | 4494 | 7548 | 14241 | 26872 | 36599 | 33934 | 25023 | ... | 34846 | 41338 | 48667 | 55500 | 50186 | 44789 | 44661 | 39913 | 27712 | 14869 |
| Wednesday | 7644 | 4324 | 3141 | 4855 | 7511 | 13794 | 26943 | 36495 | 33826 | 25635 | ... | 35148 | 43388 | 50684 | 55637 | 52732 | 47017 | 47772 | 44553 | 32868 | 18146 |
7 rows × 24 columns
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=False)
plt.title('Journeys by Weekday and Hour')
plt.show()
df.head()
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-09-01 00:01:00 | 40.2201 | -74.0021 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 1 | 2014-09-01 00:01:00 | 40.7500 | -74.0027 | B02512 | Monday | 1 | 1 | 9 | 0 |
| 2 | 2014-09-01 00:03:00 | 40.7559 | -73.9864 | B02512 | Monday | 1 | 3 | 9 | 0 |
| 3 | 2014-09-01 00:06:00 | 40.7450 | -73.9889 | B02512 | Monday | 1 | 6 | 9 | 0 |
| 4 | 2014-09-01 00:11:00 | 40.8145 | -73.9444 | B02512 | Monday | 1 | 11 | 9 | 0 |
def heatmap(col1,col2):
by_cross = df.groupby([col1,col2]).apply(lambda x:len(x))
pivot=by_cross.unstack()
plt.figure(figsize=(10,6))
return sns.heatmap(pivot,annot=False)
## validating above Analysis through Heatmap
heatmap('day','hour')
plt.show()
heatmap('day','month')
plt.show()
df[df['month']==4]
| Date/Time | Lat | Lon | Base | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | Tuesday | 1 | 11 | 4 | 0 |
| 1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | Tuesday | 1 | 17 | 4 | 0 |
| 2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | Tuesday | 1 | 21 | 4 | 0 |
| 3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | Tuesday | 1 | 28 | 4 | 0 |
| 4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | Tuesday | 1 | 33 | 4 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564511 | 2014-04-30 23:22:00 | 40.7640 | -73.9744 | B02764 | Wednesday | 30 | 22 | 4 | 23 |
| 564512 | 2014-04-30 23:26:00 | 40.7629 | -73.9672 | B02764 | Wednesday | 30 | 26 | 4 | 23 |
| 564513 | 2014-04-30 23:31:00 | 40.7443 | -73.9889 | B02764 | Wednesday | 30 | 31 | 4 | 23 |
| 564514 | 2014-04-30 23:32:00 | 40.6756 | -73.9405 | B02764 | Wednesday | 30 | 32 | 4 | 23 |
| 564515 | 2014-04-30 23:48:00 | 40.6880 | -73.9608 | B02764 | Wednesday | 30 | 48 | 4 | 23 |
564516 rows × 9 columns
heatmap('weekday','month')
plt.show()
uber_15 = pd.read_csv(r'C:\Datasets\Uber Analysis\uber-raw-data-janjune-15.csv',encoding='utf-8')
uber_15.head()
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 |
uber_15.shape
(14270479, 4)
#Checking the minimum date in the uber_15
uber_15['Pickup_date'].min()
'2015-01-01 00:00:05'
#Checking the maximum date in the uber_15
uber_15['Pickup_date'].max()
'2015-06-30 23:59:00'
uber_15['Pickup_date'] = pd.to_datetime(uber_15['Pickup_date'], format='%Y-%m-%d %H:%M:%S')
uber_15['weekday']=uber_15['Pickup_date'].dt.day_name()
uber_15['day']=uber_15['Pickup_date'].dt.day
uber_15['minute']=uber_15['Pickup_date'].dt.minute
uber_15['month']=uber_15['Pickup_date'].dt.month
uber_15['hour']=uber_15['Pickup_date'].dt.hour
uber_15.head()
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | weekday | day | minute | month | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 | Sunday | 17 | 47 | 5 | 9 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 | Sunday | 17 | 47 | 5 | 9 |
| 2 | B02617 | 2015-05-17 09:47:00 | B02617 | 100 | Sunday | 17 | 47 | 5 | 9 |
| 3 | B02617 | 2015-05-17 09:47:00 | B02774 | 80 | Sunday | 17 | 47 | 5 | 9 |
| 4 | B02617 | 2015-05-17 09:47:00 | B02617 | 90 | Sunday | 17 | 47 | 5 | 9 |
px.bar(x=uber_15['month'].value_counts().index,
y=uber_15['month'].value_counts().values,
labels={'x':'Month','y':'Num of Pickups'},
title=('Num of Pickups in NYC'))
plt.figure(figsize=(12,6))
sns.countplot(uber_15['hour'])
plt.show()
c:\users\sl149\appdata\local\programs\python\python36\lib\site-packages\seaborn\_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count()
weekday hour
Friday 0 85939
1 46616
2 28102
3 19518
4 23575
...
Wednesday 19 143751
20 136003
21 133993
22 127026
23 99490
Name: Pickup_date, Length: 168, dtype: int64
uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count().reset_index()
| weekday | hour | Pickup_date | |
|---|---|---|---|
| 0 | Friday | 0 | 85939 |
| 1 | Friday | 1 | 46616 |
| 2 | Friday | 2 | 28102 |
| 3 | Friday | 3 | 19518 |
| 4 | Friday | 4 | 23575 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 143751 |
| 164 | Wednesday | 20 | 136003 |
| 165 | Wednesday | 21 | 133993 |
| 166 | Wednesday | 22 | 127026 |
| 167 | Wednesday | 23 | 99490 |
168 rows × 3 columns
summary=uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count().reset_index()
summary=summary.rename(columns = {'Pickup_date':'Counts'})
summary
| weekday | hour | Counts | |
|---|---|---|---|
| 0 | Friday | 0 | 85939 |
| 1 | Friday | 1 | 46616 |
| 2 | Friday | 2 | 28102 |
| 3 | Friday | 3 | 19518 |
| 4 | Friday | 4 | 23575 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 143751 |
| 164 | Wednesday | 20 | 136003 |
| 165 | Wednesday | 21 | 133993 |
| 166 | Wednesday | 22 | 127026 |
| 167 | Wednesday | 23 | 99490 |
168 rows × 3 columns
plt.figure(figsize=(10,6))
sns.pointplot(x="hour", y="Counts", hue="weekday", data=summary)
plt.show()
Like in our previous analysis, we can observe that in the middle of the week the morning rush is basicaly the same while on weekend it drops down because people stay home. On the other hand, there is a slight increase in demand on afternoon and evening because people tend to travel and hangout.